
********************************************* 
** Stata code to prepare data for analysis **
*********************************************

*** Preliminaries ***
** Compustat **

* Load the raw Compustat extract and take a first look at its contents.
use "\other\Compustat.dta", clear

desc
summ datadate, format
tab indfmt

* gvkey is converted to numeric; it is used as the panel identifier further down.
destring gvkey, replace

/* dup = number of OTHER observations that share the same gvkey-datadate pair
(0 = unique, 1 = exactly one duplicate, and so on). */
duplicates tag gvkey datadate, gen(dup)

tab dup

/* Dry run only: the drop is undone by -restore-, so the data in memory are
unchanged. It is kept so that the log shows how many obs have dup==1. */
preserve
drop if dup==1
restore

/* There are 46838 obs with one duplicate based on gvkey and datadate,
so we can get rid of 46838 obs */
order gvkey datadate dup

preserve
/* Identify and keep all obs for gvkeys that have a duplicate.
keeper is the within-gvkey maximum of the indicator (dup==1), i.e. 1 for every
observation of a firm that has at least one duplicated gvkey-datadate pair.
A single egen call replaces a levelsof/foreach loop that issued one -replace-
over the full dataset per duplicated gvkey. */
bysort gvkey: egen byte keeper = max(dup==1)

/* Tabulate dup before and after dropping obs without duplicates
as a logic check. */
tab dup
keep if keeper==1
tab dup

/* Duplicates are mostly for INDFMT values INDL and FS for the
same firm. These are two different reporting formats that, per
COMPUSTAT, allow firms that are non-financial services to also report
in a financial services format. Per an eyeball inspection of the data,
the FS line items are mostly missing values. The goal is to
keep whichever obs has the fewest missing values. */
restore

* mc = number of missing values per observation across bkvlps through prcc_f.
egen mc = rowmiss(bkvlps-prcc_f)

/* Intermediate step: minmc is the smallest missing-value count within each
gvkey-datadate cell. The bysort prefix also leaves the data sorted by
gvkey datadate. */
bysort gvkey datadate: egen minmc = min(mc)

/* del flags a duplicated observation that does not attain the minimum
missing-value count of its cell, provided the next OR the previous row belongs
to the same gvkey. The look-ahead comparison covers every row except the last
one of a firm; the look-back comparison covers that last row. */
generate del = (dup != 0 & minmc != mc & ///
    (gvkey == gvkey[_n+1] | gvkey == gvkey[_n-1]))

order gvkey datadate dup mc minmc del

tabulate indfmt if del==1

list gvkey sic if del==1 & indfmt== "INDL"

/* Per OSHA SIC codes, SIC codes 60-67 are Finance, Insurance,
and Real Estate. It makes sense that these firms have more missing values for
INDL than for FS in the indfmt variable. */

destring sic, replace

count if sic > 5999 & sic < 6800
/* 137,605 Division H (Finance, Insurance, Real Estate) firms */

tabulate indfmt if sic > 5999 & sic < 6800
/* ...but it looks like most report their results using INDL and not FS */

count if del==1 & indfmt== "INDL"

drop if del==1

/* Above we dropped 35,927 of the 36,852 duplicates. Based on that
calculation, 925 duplicates on gvkey and datadate remain. */

duplicates tag gvkey datadate, generate(dup2)
tabulate dup2

/* Within each gvkey-datadate cell, tie_gen marks an observation whose
missing-value count equals that of the next observation in the cell. This tags
one row of a tied pair and never the last row of a cell. We do not care which
of the tied rows is retained, since they have the same number of missing
values (unless some variables matter more than others, which we have no reason
to believe here). */
bysort gvkey datadate: generate tie_gen = (mc == mc[_n+1])
order gvkey datadate dup dup2 mc minmc del tie_gen

drop dup2
drop if tie_gen==1

/* Now that duplicate datadates have been taken care of, create
a variable equal to the year component of datadate so that an
annual xtset may be done */
gen year= year(datadate)
duplicates tag gvkey year, gen(ydup)
tab ydup

/* While gvkey datadate is now unique, gvkey year is not */
preserve
keep if ydup==1

/* An eyeball inspection shows that these duplicates are for firms
that changed their financial reporting period year-ends. */

list gvkey datadate year fyear in 1/10

/* Note that fyear has been defined by COMPUSTAT to attempt to
compensate for this behavior */

restore

count

count if fyear==.
/* The 723 missing values above will trigger an error if we try to xtset
based on gvkey and fyear now. We therefore replace missing fyear based
on the COMPUSTAT data definition: fiscal years ending January-May are
assigned to the previous calendar year. */
replace fyear= year(datadate)-1 if fyear== . & month(datadate) <=5
replace fyear= year(datadate) if fyear==. & month(datadate) >5

/* All 723 missing fyears have been taken care of above. Next, we check
for duplicates */

duplicates tag gvkey fyear, gen(dup4)
tab dup4

/* There are 36/2= 18 duplicates. This is explored below */
list gvkey datadate fyear if dup4 != 0
/* These appear to be firms that switched financial reporting periods
but ended up with the same fyear based on the old and new year-ends.
Eyeball inspection shows that one observation of each pair has many missing
values. We will drop whichever has the most missing values using a
modification of the minmc code used previously. */

/* The original minmc code needs to be modified since minmc was originally
calculated over observations of a firm with the same DATADATE.
Here, these firms switched financial reporting period-ends, but not to the
extent that the FYEAR variable changed, so minmc is re-calculated within
gvkey-fyear cells. */

drop minmc
bys gvkey fyear: egen minmc= min(mc)
* NOTE: the column order set here is relied upon later by -keep gvkey-year-.
order gvkey datadate fyear dup mc minmc del tie_gen

/* The flag uses dup4 != 0 (not dup4==1) so that a gvkey-fyear cell with three
or more observations is handled as well; with exact pairs only, as in the
current extract, the result is the same. Any surviving repeat would make the
xtset below fail. */
replace del=1 if dup4 != 0 & gvkey[_n]== gvkey[_n+1] & minmc != mc | ///
dup4 != 0 & gvkey[_n]== gvkey[_n-1] & minmc != mc
drop if del==1

/* similar to before, we also need to deal with any potential ties in the
missing-value count within a gvkey-fyear cell */

bys gvkey fyear: replace tie_gen=1 if mc[_n]== mc[_n+1]
drop if tie_gen==1

* Declare the firm-year panel, then let the fiscal year take over the name "year".
xtset gvkey fyear, yearly
drop year
rename fyear year 
count

/* Bring in emp_sumseg from the emp_sumseg file and use it to fill gaps in
emp; observations that exist only in the using file are discarded. */
merge 1:1 gvkey year using "emp_sumseg" 
drop if _merge == 2
drop _merge
* emp is rescaled by 1000 (presumably thousands -> persons; confirm units of emp_sumseg).
replace emp = emp * 1000
replace emp = emp_sumseg if mi(emp) & !mi(emp_sumseg)
*(862 real changes made)
* Linear interpolation of emp over year within firm.
bysort gvkey: ipolate emp year, gen(emp_ipolated)
label variable emp_ipolated "Number of employees (interpolated)"
drop emp_sumseg
/*
replace gdp_defltr = gdp_defltr / 100 
replace gdpi_defltr = gdpi_defltr/ 100
gen at_adj = at/gdp_defltr 
gen ppe_adj = ppent/gdp_defltr 
*/

* Capital-structure ratios.
gen debt = dlc + dltt 
gen debt_current_ratio = dlc / at
gen debt_ratio = debt/at
gen leverage = debt/seq 
gen tobin = (debt + prcc_f*csho) / at 

* Two- and three-digit industry codes derived from the four-digit SIC.
destring sic,replace
gen sic2=int(sic/100)
gen sic3=int(sic/10)
gen ins_perc = cshi / cstk 
gen ex_fin = fincf / xrd 
gen liquidity = che/ at 
/*
gen revt_adj = revt/gdp_defltr
gen ebitda_adj = ebitda/gdp_defltr
gen sale_adj = sale/gdp_defltr
gen xad_adj = xad/gdp_defltr
*/

xtset gvkey year
* change = first difference of txditc within firm.
gen change = txditc-l.txditc
gen cash_at = che/at
**Cash and Short-Term Investments
gen free_cash = ivncf + oancf
gen CF_rate = (oibdp-(txt-change)-dvp-dvc)/at
**Operating income before depreciation, minus (taxes minus the change in
**txditc), minus dvp and dvc, scaled by total assets
label var CF_rate "Free cash flow"

* R&D expenditure with two alternative treatments of missing values.
bysort sic2: egen xrd_avg = mean(xrd) 
gen xrd1 = xrd
replace xrd1 = 0 if mi(xrd)  
gen xrd2 = xrd
replace xrd2 = xrd_avg if mi(xrd2) 
label var xrd1 "R&D expenditure (missing to 0)"
label var xrd2 "R&D expenditure (missing to industry average)"
gen byte mi_rd = cond(mi(xrd), 1, 0)

/* R&D intensity. rd_int1 is referenced by its full name below; the earlier
spelling "rd_int" only resolved through variable abbreviation, which fails
under -set varabbrev off- or once a second rd_int* variable exists. */
gen rd_int1 = xrd/sale
bysort sic2 : egen rd_median = median(rd_int1) 
gen rd_int2 = rd_int1 
replace rd_int2 = rd_median if mi(xrd)  
replace rd_int1 = 0 if mi(xrd)
label var rd_int1 "R&D intensity (missing to 0)"
label var rd_int2 "R&D intensity (missing to industry median)"

xtset gvkey year
* NOTE(review): mkvalt is a firm total while bkvlps is per share - confirm the
* intended definition of the market-to-book ratio.
gen mtb = mkvalt/bkvlps
gen ppenme_ratio = ppenme/at  
gen revt_per = revt/emp_ipolated
gen ebitda_per = ebitda/emp_ipolated
gen ppe_per = ppegt/emp_ipolated
gen capex_ratio = capx/at

gen roa= ebitda / at 
*drop if ppe_per < 0
* Missing revt compares as larger than 0, so neg_revt is 0 when revt is missing.
gen neg_revt = cond(revt <= 0, 1, 0)

* Winsorise in place (winsor2 default cut-offs).
winsor2 at revt tobin ppegt roa mtb ppenme_ratio capex_ratio ///
 leverage ebitda revt_per capx    xrd1 xrd2 , replace

* Log / inverse-hyperbolic-sine transforms and defense-industry indicators.
xtset gvkey year 
gen ppe_per_inh = asinh(ppe_per)
gen revt_per_inh = asinh(revt_per)
gen capx_inh = asinh(capx)
gen at_ln = ln(at)
gen tobin_ln = ln(tobin)
gen xrd1_inh = asinh(xrd1)
gen emp_ln = ln(emp_ipolated) 
gen emp_inh = asinh(emp_ipolated)
gen defense1 = inlist(sic, 3480, 3760)
gen defense2 = inlist(sic, 3480, 3510, 3720, 3721, 3724, 3728, 3760, 3812)

/* Retain identifiers, raw accounting items and every constructed variable.
The ranges gvkey-year and emp_ipolated-defense2 depend on the current column
order of the dataset. */
keep gvkey-year at ebitda ebit ppegt revt xrd xad xsga ///
 xstf xstfws xt naicsh sich rank au auop auopic addzip busdesc city /// 
 emp conml county naics spcindcd sic spcseccd spcsrc state /// 
 ipodate emp_ipolated-defense2 naics sic cik ivncf oancf che defense1 defense2 ///
 dvdnp cdvc dv dvc dvdnp dvintf dvp dvpa dvpd dvt dvsco dvrpiv dvpdp

/* Dividend-payer indicator. Missing values compare as "not equal to 0" in
Stata, so missing dvt must be excluded explicitly; otherwise every firm-year
with missing dividends would be coded as a payer. Missing dvt is coded 0. */
gen div_yes = cond(!mi(dvt) & dvt != 0, 1, 0)
label var div_yes "Pay any dividends"

* Attach the circuit of the firm's state for each year; using-only rows are discarded.
rename state state_code
merge m:1 state_code year using "fedapp_yrs.dta"
drop if _merge == 2
drop _merge 
* Prefix with cmp_ to mark these as attributes of the Compustat firm.
rename circuit cmp_circuit 
rename state_code cmp_state
save "cmpst_proed.dta", replace


** Patent -- assignee, app and grant year **
* Load only the four variables needed, de-duplicate, and store as a lookup file.
use patent pdpass gyear appyear using "\NBER\pat76_06_assg.dta", clear
duplicates drop
save "pat_gappyr.dta", replace

** Litigation by subcat **
import delimited using "\patent\LitigatedPatents.csv" , clear
keep patent masterindex class subclass claims patdef cafcera dcdecision dcstate ///
  district distdate affirmed acdecision acdate firstac dc2nddec dc2nddecdate ///
  ac2nddecision ac2nddecdate invstate assstate numinventors dcnumpatents ///
  acnumpatents issuedate  continuation division assignee

* Date of decision at different courts *
* Clean two irregular spellings, then split the date string on "/".
replace distdate = "" if distdate == "unknown"
replace distdate = "1995" if  distdate == "//1995"
split distdate, parse("/")
rename (distdate1 distdate2 distdate3) (dcmonth dcday dcyear)

destring dcyear, replace

destring dcmonth, replace
*br dc* if dcmonth >= 12 & !mi(dcmonth)
* A bare "1995" lands in the first piece; move it to the year field.
replace dcyear = 1995 if dcmonth == 1995
replace dcmonth = . if dcmonth == 1995

replace dcday = "" if dcday == " "
destring dcday, replace

/* First pieces larger than 12 are treated as day counts: subtracting 21916
and formatting as %td turns them into Stata dates, from which year, month and
day are recovered. */
generate dc = dcmonth - 21916 if dcmonth > 12
format dc %td
replace dcyear = year(dc) if !mi(dc)
replace dcmonth = month(dc) if !mi(dc)
replace dcday = day(dc) if !mi(dc)
drop dc

generate lit = 1
destring patent, replace force

* Attach NBER patent attributes (all pairwise matches on patent).
joinby patent using "\NBER\pat76_06_assg.dta"

keep patent dcyear gyear appyear subcat cat
duplicates drop
* Only decisions up to 1985 are retained; rows with missing dcyear are removed too.
drop if dcyear > 1985
* Number of distinct litigated patents per subcategory and per category.
bysort subcat: egen subcat_npatlit = nvals(patent)
bysort cat: egen cat_npatlit = nvals(patent)
keep cat subcat subcat_npatlit cat_npatlit
duplicates drop
save "patlit_subcat.dta", replace

** Match patent with gvkey **
use "\NBER\pat76_06_assg.dta", clear
rename appyear year

* Keep only assignees present in the dynamic assignee-to-gvkey file.
merge m:1 pdpass using "\NBER\dynass.dta"
tabulate _merge
keep if _merge == 3

/* Pick the gvkey whose ownership window [begyr, endyr] contains the
application year; when several windows match, the highest-numbered one wins. */
generate gvkey = .
forvalues k = 1/5 {
    replace gvkey = gvkey`k' if gvkey`k' != . & year >= begyr`k' & year <= endyr`k'
}
keep if !mi(gvkey)
drop _merge begyr1-endyr5

keep patent gvkey year
duplicates drop
compress
save "patgvkey.dta", replace

** Patent citations **
use "\NBER\cite76_06.dta", clear

* --- Cited side: grant/application year, assignee and gvkey (matches only) ---
rename cited patent
joinby patent using "pat_gappyr.dta"
rename (gyear appyear pdpass) (cited_gyr cited_appyr cited_pdpass)

joinby patent using "patgvkey.dta"
rename (patent gvkey) (cited cited_gvkey)

* --- Citing side: same attributes, but unmatched citations are retained ---
rename citing patent
joinby patent using "pat_gappyr.dta", unmatched(master)
rename (gyear appyear pdpass) (citing_gyr citing_appyr citing_pdpass)
drop _merge

joinby patent using "patgvkey.dta", unmatched(master)
rename gvkey citing_gvkey
drop year
drop ncites7606
rename patent citing_patent
drop _merge

* Self-citation: citing and cited patent map to the same (non-missing) gvkey.
generate self_cite = (cited_gvkey == citing_gvkey & !mi(citing_gvkey))

* gap = citing application year minus cited grant year; threeyr = 1 when gap <= 3.
generate gap = citing_appyr - cited_gyr
generate threeyr = (gap <= 3)

* (1) Distinct citing patents per cited patent, excluding self-citations.
preserve
keep if self_cite == 0
bysort cited: gegen cites_others = nvals(citing_patent)
keep cited cites_others
duplicates drop
rename cited patent
compress
save "\patent\cites_others.dta", replace
restore

* (2) As (1), restricted to citations with threeyr == 1.
preserve
keep if self_cite == 0 & threeyr == 1
bysort cited: gegen cites_others_3yr = nvals(citing_patent)
keep cited cites_others_3yr
duplicates drop
rename cited patent
compress
save "\patent\ncites_3yr.dta", replace
restore

* (3) Self-citations with threeyr == 1.
preserve
keep if self_cite == 1 & threeyr == 1
bysort cited: gegen cites_self_3yr = nvals(citing_patent)
keep cited cites_self_3yr
duplicates drop
rename cited patent
compress
save "\patent\selfncites_3yr.dta", replace
restore


** Patent citations to science (Marx and Fuegi, SMJ 2020) **
* Builds one row per patent with pcs = number of distinct cited papers (magid).
* NOTE(review): this is the only path in the file that uses the global $patent;
* other paths are written literally as "\patent\..." - confirm the global is set.
import delimited "$patent\pcs_mag.tsv", clear
* Only links with confscore equal to 10 are retained.
keep if confscore == 10 
* One row per patent-paper pair, so the count below is a count of distinct papers.
duplicates drop patent magid, force
bysort patent: gegen pcs = count(magid)
keep patent pcs
duplicates drop patent pcs, force
* The original identifier is kept as patfam; patent becomes its numeric characters only.
rename patent patfam
gegen patent = sieve(patfam), keep(numeric)
destring patent, replace
* NOTE(review): the assemble step later merges "\patent\patent_pcs.dta", not
* this file - confirm whether pat_pcs.dta is used elsewhere.
save "\patent\pat_pcs.dta" , replace


**************************** 
*** Assemble the dataset ***
****************************

* Start from NBER patent-assignee records with country "US" and attach two
* alternative firm identifiers: the NBER route (pdpass -> gvkey -> permno) and
* the KPSS route (permno from the kpss-xi file -> gvkey).
use "\NBER\pat76_06_assg.dta" , clear
keep if country == "US"
*This line is revised from the previous day
merge m:1 pdpass using "\NBER\dynass.dta"
tab _m
keep if _m == 3
* gvkey = the gvkey whose [begyr, endyr] window contains the application year;
* if several windows match, the highest-numbered one overwrites the others.
gen gvkey =.
forvalue i = 1/5 {
replace gvkey = gvkey`i' if gvkey`i'~=. & appyear>=begyr`i' & appyear<=endyr`i'
}
**should keep these patents in the dataset, just in case kpss has matching 
* (i.e. rows with missing gvkey are NOT dropped at this point)
drop _merge
drop begyr1-endyr5

* One row per gvkey-patent pair; -force- discards rows that differ in other variables.
duplicates drop gvkey patent, force  
* NOTE(review): the tostring/destring round trip below leaves gvkey numeric again.
tostring gvkey , replace
rename subclass subclass_nber
rename state assstate
* From here on "year" is the grant year.
rename gyear year
destring gvkey, replace

* n_ass = number of distinct assignees (pdpass) remaining per patent.
bysort patent: egen n_ass = nvals(pdpass)

* Patent values from the kpss-xi file (xi is used below); using-only rows are discarded.
merge m:1 patent using  "\...\kpss-xi.dta"
drop if _merge == 2
drop _merge 

* NBER route: map the NBER gvkey to a permno valid in the grant year.
rename permno KPSSpermno
merge m:1 gvkey using "\Compustat\gvkey-permno-link.dta"
gen permno = . 
forvalues i = 1/8{
 replace permno = lpermno`i' if !mi(lpermno`i')  & year >= begyr`i' & year <= ///
 endyr`i'    
}
drop if _merge == 2
drop _merge 
* Range drop: relies on the column order of the link file.
drop lpermno1-endd8
rename permno NBERpermno 
rename gvkey NBERgvkey 

* KPSS route: map the KPSS permno to a gvkey valid in the grant year.
gen gvkey = . 
rename KPSSpermno lpermno 
merge m:1 lpermno using "\Compustat\permno-gvkey-link.dta" 
drop if _merge == 2
forvalues i = 1/8{
 replace gvkey = gvkey`i' if !mi(gvkey`i')  & year >= begyr`i' & year <= ///
 endyr`i'    
}
* Range drop: removes the link-file columns together with _merge.
drop gvkey1-_merge
rename gvkey KPSSgvkey 
rename lpermno KPSSpermno

** For multiple assignees, patent value is based on one assignee **
* seq  = position of the row within its patent when sorted by xi
* seqt = number of rows of the patent at this point
* check1 / check2 = KPSS minus NBER identifier (0 when both routes agree,
*                   missing when either side is missing)
bysort patent (xi): gen seq = _n
bysort patent: gen seqt = _N
bysort patent: egen KPSSpermno_mean = mean(KPSSpermno)
bysort patent: egen permno_mean = mean(NBERpermno) 
bysort patent: egen xi_mean =  mean(xi)
gen check1 = KPSSpermno - NBERpermno 
gen check2 = KPSSgvkey - NBERgvkey 

**step 1: for patents value that are matched 

** Single assignee patent
** (a) missing patent values
*br patent KPSSpermno NBERpermno NBERgvkey KPSSgvkey KPSSpermno_mean permno_mean xi n_ass n_gvkey if mi(xi) & n_ass == 1	
drop if mi(xi) & n_ass == 1
** (b) not missing but NBER permno does not equal KPSS permno 
*br patent KPSSpermno NBERpermno NBERgvkey KPSSgvkey xi n_ass n_gvkey check1 check2 if !mi(xi) & n_ass == 1 & check1 != 0 & !mi(NBERgvkey) & !mi(KPSSpermno)
**taking NBER matching results as the default 
**there are 21,467 such patents, out of 743,725, having NBER and KPSS matched to different companies and yield non-missing patent values 
* gvkey = NBER gvkey, falling back to the KPSS gvkey when the NBER one is missing.
gen gvkey = NBERgvkey 
replace gvkey = KPSSgvkey if mi(NBERgvkey)
** in most cases, the permno gvkey link is not proper

** Multiple assignee patent 
** (a) missing patent values 
*br patent KPSSpermno NBERpermno KPSSpermno_mean permno_mean xi n_ass n_gvkey if check1 != 0 & n_ass != 1	& xi_mean == .
drop if mi(xi_mean) & n_ass != 1   
** (b) NBER multiple assignee, matched to different gvkey but only one permno 
** replace xi with inconsistent permno to missing as KPSS calculates patent
** value for one assignee
*br patent gvkey xi KPSSpermno permno KPSSpermno_mean permno_mean ///
*br patent KPSSpermno NBERpermno KPSSpermno_mean permno_mean xi n_ass n_gvkey if n_ass != 1 & xi_mean != .  & seqt != 1 
* If the smallest check1 of a patent is 0, drop its rows where the two permnos
* disagree (rows with missing check1 count as disagreeing).
bysort patent: egen check1_min = min(check1)
drop if !mi(xi_mean) & check1_min == 0 & n_ass != 1 & check1 != 0 

*br patent KPSSpermno NBERpermno KPSSpermno_mean permno_mean xi n_ass n_gvkey if n_ass != 1 & xi_mean != .  & check1_min != 0
* Same rule, applied when the largest check1 of the patent is 0.
bysort patent: egen check1_max = max(check1)
*br patent KPSSpermno NBERpermno KPSSpermno_mean permno_mean xi n_ass n_gvkey if n_ass != 1 & xi_mean != .  & check1_min != 0
drop if !mi(xi_mean) & check1_max == 0 & n_ass != 1 & check1 != 0 

*br patent KPSSpermno NBERpermno KPSSpermno_mean permno_mean xi n_ass n_gvkey if n_ass != 1 & xi_mean != .  & check1 != 0
drop if mi(gvkey) & n_ass != 1 & xi_mean != .  & check1 != 0

* check_ct = number of other rows of the same patent, measured BEFORE the
* drops that follow.
duplicates tag patent, gen(check_ct)
drop if xi_mean == . 

*Drop patents for which gvkey changed within patent grant year**
bysort patent: egen n_gvkey = nvals(gvkey)
drop if check_ct !=0  & mi(NBERgvkey)
bysort patent: egen n_kpssgvkey = nvals(KPSSgvkey)
* When the KPSS route gives exactly one gvkey but the combined gvkey does not, use the KPSS one.
replace gvkey = KPSSgvkey if n_kpssgvkey == 1 & n_gvkey != 1 
* For patents that were duplicated, keep only the row with seq == seqt.
* NOTE(review): seq and seqt were computed before the drops above, so a
* patent whose seq == seqt row was already removed loses all of its rows here.
drop if seq != seqt & check_ct != 0

* Log check that patent is now unique (the 1:1 merges that follow require it).
duplicates report patent 
drop check* 
drop seq seqt xi_mean 
drop NBERgvkey KPSSgvkey n_gvkey n_kpssgvkey 
drop *permno 


* Patent novelty *
* Every merge in this section keeps master rows only (matched or unmatched).
rename patent publn_nr
merge 1:1 publn_nr using "\patent\patent_novelty.dta", keep(master match) nogenerate
rename publn_nr patent

* Patent citations to science *
merge 1:1 patent using "\patent\patent_pcs.dta", keep(master match) nogenerate
* Missing counts are set to 0 for every variable from nbodyonly through nfrontonly.
foreach v of varlist nbodyonly-nfrontonly {
    replace `v' = 0 if mi(`v')
}

* Patent claims *
* The document-stats file is merged on a string patent key; convert and convert back.
tostring patent, replace
merge 1:1 patent using "\patent\patent_document_stats.dta", keep(master match) nogenerate
destring patent, replace

* Forward citations by other firms; patents absent from the file get 0.
merge 1:1 patent using "\patent\cites_others.dta", keep(master match)
replace cites_others = 0 if _merge == 1
drop _merge
label var cites_others "Total number of forward citations made by other businesses"

* The same for the cites_others_3yr count.
destring patent, replace
merge 1:1 patent using "\patent\ncites_3yr.dta", keep(master match)
replace cites_others_3yr = 0 if _merge == 1
drop _merge

compress
save "patval_prep.dta"  , replace

 
** Merge patents with Compustat **
use "patval_prep.dta" , clear
destring gvkey, replace
* Matched firm-years only  // no controls after 2000
merge m:1 gvkey year using "cmpst_proed.dta", keep(match) nogenerate
rename cmp_state cmp_state_code
rename state cmp_state

** Alternative versions of CAFC index **
* Circuit of the assignee's state; matched rows only
* (AB, BC, MB, ON, QC and SK are dropped).
rename assstate state_code
merge m:1 state_code year using "fedapp_yrs.dta", keep(match) nogenerate

* Circuit-level rates; matched rows only (DC dropped).
merge m:1 circuit using "CAFC_ct_updated.dta", keep(match) nogenerate

/* Each index equals the "pre" rate for years before 1983 and the "post" rate
from 1986 onwards; 1983-1985 are left missing. As in the two-step
gen/replace form, a missing year receives the "post" rate. */
generate CAFC1 = cond(year < 1983, ratepre_all, cond(year >= 1986, ratepost_all, .))
label var CAFC1 "CAFC index constructed by all cases"

generate CAFC2 = cond(year < 1983, ratepre_home, cond(year >= 1986, ratepost_home, .))
label var CAFC2 "CAFC index constructed by home circuit cases"

generate CAFC3 = cond(year < 1983, ratepre_pattiff, cond(year >= 1986, ratepost_pattiff, .))
label var CAFC3 "CAFC index constructed by cases with patentee as the plaintiff"

generate CAFC4 = cond(year < 1983, ratepre_home_pattiff, cond(year >= 1986, ratepost_home_pattiff, .))
label var CAFC4 "CAFC index constructed by cases with patentee as the plaintiff tried at home circuit"

generate CAFC_wt = cond(year < 1983, ratepre_all_wt, cond(year >= 1986, ratepost_all_wt, .))
label var CAFC_wt "CAFC index constructed by all cases (weighted by likelihood of case transfer)"

/** isic rev 3 **
merge m:1 sic3 using "$stata\rdext\sic3_isic_mapping.dta"
drop if _m == 2
drop _m

** correction for mapping using sic4 **
replace ISIC = 3230 if sic == 3663
replace ISIC = 3211 if sic == 3674 
rename ISIC isicr3 */ 

* Map SIC to ISIC rev. 3 (isicr3); using-only rows are discarded.
merge m:1 sic using "\other\concdce_ussic87_isicr3.dta", keepusing(isicr3) ///
    keep(master match) nogenerate

* Build the industry-level patent-effectiveness lookup in a temporary file.
preserve
use  "\other\ip_cohen.dta", clear
foreach t in prod proc {
    egen median_pateff_`t' = median(pateff_`t')
}
* high_* = 1 when the industry value exceeds the overall median
* (a missing value compares as larger than the median).
foreach t in prod proc {
    generate high_pateff_`t' = (pateff_`t' > median_pateff_`t')
}
keep isicr3 pateff_prod pateff_proc median_pateff_prod median_pateff_proc ///
 high_pateff_prod high_pateff_proc complex
tempfile temp
save `temp', replace
restore

merge m:1 isicr3 using `temp', keep(master match) nogenerate

* Manufacturing: two-digit SIC 20-39.
generate byte manuf = (sic2 >= 20 & sic2 < 40)

generate byte defense = inlist(sic3, 372, 376, 381)

/* complex_isic starts from the value of complex delivered by the ISIC merge;
the rules below only fill observations where it is still missing. The SIC
ranges do not overlap, so their order does not matter. */
generate complex_isic = complex

* --- three-digit style codes ---
* NOTE(review): these are compared against the four-digit sic variable - confirm intent.
replace complex_isic = 0 if inrange(sic, 200, 230) & mi(complex_isic)
*sic = 240, unclear match 
replace complex_isic = 0 if inlist(sic, 260, 280, 310) & mi(complex_isic)
*sic = 360, unclear match 
*sic 2400, unclear match 
*sic 2430 unclear match 
*sic 3600 unclear match 

* --- electrical / communication equipment codes set to complex ---
replace complex_isic = 1 if inlist(sic, 3620, 3630, 3640, 3669) & mi(complex_isic)
*communication equipment 
replace complex_isic = 1 if inlist(sic, 3670, 3690) & mi(complex_isic)
*  3950 unclear match 
* 3990 unclear match 

* --- SIC 20xx-23xx ---
replace complex_isic = 0 if inrange(sic, 2000, 2099) & mi(complex_isic)
replace complex_isic = 0 if sic == 2100 & mi(complex_isic)
replace complex_isic = 0 if inrange(sic, 2200, 2299) & mi(complex_isic)
replace complex_isic = 0 if inrange(sic, 2300, 2399) & mi(complex_isic)
*replace complex_isic = 0 if sic == 2400 | sic == 2430 & mi(complex_isic) 
*This matching is fuzzy, because there are complex under sic 24 and sic 243

* --- SIC 25xx-30xx ---
replace complex_isic = 1 if inrange(sic, 2510, 2531) & mi(complex_isic)
replace complex_isic = 0 if inlist(sic, 2540, 2590) & mi(complex_isic)
replace complex_isic = 0 if inrange(sic, 2600, 2679) & mi(complex_isic)
replace complex_isic = 0 if inrange(sic, 2750, 2790) & mi(complex_isic)
replace complex_isic = 0 if sic >= 2800 & sic < 3000 & mi(complex_isic)
replace complex_isic = 0 if inrange(sic, 3000, 3080) & mi(complex_isic)

* --- SIC 31xx-39xx ---
replace complex_isic = 0 if inrange(sic, 3100, 3479) & mi(complex_isic)
replace complex_isic = 1 if inrange(sic, 3480, 3489) & mi(complex_isic)
replace complex_isic = 0 if inrange(sic, 3490, 3499) & mi(complex_isic)
replace complex_isic = 1 if inrange(sic, 3500, 3599) & mi(complex_isic)
*replace complex_isic = 1 if sic >= 3600 & sic <= 3690 & mi(complex_isic) 
*Fuzzy match here for sic 3600 , as complex = 0 for sic 3651 and 3652
replace complex_isic = 1 if inrange(sic, 3720, 3790) & mi(complex_isic)
replace complex_isic = 0 if sic == 3960 & mi(complex_isic)

* Outcome transforms and the home-circuit indicator
* (assignee circuit equal to the Compustat firm's circuit).
generate xi_ln = ln(xi)
generate home_circuit = (circuit == cmp_circuit)
generate patcite_ln = ln(allcites + 1)
generate patcite_inh = asinh(allcites)

label var patcite_ln "Forward citations (ln)"
label var fdate "Application date"
label var issdate  "Grant date"
label var year "Grant year"

*keep if year >= 1976 & year <= 1991  

 // limit sample to 7-year windows around years of major CAFC decisions, 1983-98

* Group identifiers: assignee-by-circuit, circuit-by-year.
egen pd_ct = group(pdpass circuit)
xtset pd_ct
egen ct_yr = group(circuit year)

* pcs = total of the three citation counts (nbodyonly, nboth, nfrontonly).
generate pcs = nbodyonly + nboth + nfrontonly

* post = 0 before 1983, 1 from 1986 onwards, missing for 1983-1985.
generate post = cond(year < 1983, 0, cond(year >= 1986, 1, .))
egen permt_cluster = group(circuit post)

generate CAFC = CAFC1

* Interactions of CAFC with CON, DIV and CIP.
foreach v of varlist CON DIV CIP {
    generate CAFC_`v' = CAFC * `v'
    label var CAFC_`v' "CAFC x `v'"
}

foreach v of varlist novelty_res min_words_age mean_words_age {
    generate `v'_inh = asinh(`v')
}

label var tobin "Tobin's Q"
label var emp_ln "Employees"
label var ppe_per_inh  "PPE per employee"
label var revt_per_inh "Revenue per employee"
label var CAFC "CAFC"
label var pcs "Citations to science"
order patent gvkey year circuit pd_ct

* Circuit-level CAFC file; variables already in memory keep their current values.
merge m:1 circuit using "\...\CAFC_ct_updated.dta", keep(master match) nogenerate

replace CAFC = ratepre_all if year <= 1982
replace CAFC = ratepost_all if year >= 1986
generate CAFC_post = gap_all * post

egen y_m = group(year gmonth)
egen gvkey_sub = group(gvkey subcat)
* Settings kept in globals.
global reps 1000
global clu_var circuit
global ctrvar emp_ln ppe_per_inh revt_per_inh xrd_per_inh  mi_rd tobin ///
       nclaims_ln  novelty_res_inh nciting_inh pcs_inh

* Estimation sample: manufacturing firms, grant years up to 1992.
keep if manuf == 1 & year <= 1992

* Inverse-hyperbolic-sine controls.
* NOTE: nclaims_ln is an asinh transform despite its _ln suffix.
generate pcs_inh = asinh(pcs)
generate nciting_inh = asinh(nciting)
generate nclaims_ln = asinh(nclaims)
generate xrd_per_inh = asinh(xrd1/emp_ipolated)

* Baseline specification; insample marks the observations it used.
reghdfe xi_ln CAFC_post $ctrvar if manuf == 1, absorb(year##nclass)
generate insample = e(sample)

* Heterogeneity interactions.
* NOTE(review): the high_* indicators are not created in this file - confirm
* that they arrive with one of the merged datasets.
generate CAFC_emp = CAFC_post * high_emp_ipolated
generate CAFC_at = CAFC_post * high_at_adj
generate CAFC_ppe = CAFC_post * high_ppe_per
generate CAFC_xrd = CAFC_post * high_xrd_per

* Keep the raw log value, then switch xi_ln to the log of xi winsorised at 1/99.
generate xi_ln_ori = xi_ln
winsor2 xi, cut(1 99)
generate xi_ln_win = ln(xi_w)
replace xi_ln = xi_ln_win

/* Two-digit-SIC fill-in of patent effectiveness.
NOTE(review): the lookup is built from "Patreg.dta", which is also the file
this script saves at the very end, so this step reads the output of a
PREVIOUS run - confirm that this is intended. */
preserve 
use "Patreg.dta", clear
keep isicr3 sic sic2 sic3 pateff_prod pateff_proc median_pateff_prod median_pateff_proc high_pateff_prod high_pateff_proc
duplicates drop 
* Industry (sic2) means of the two effectiveness scores.
bysort sic2: egen mean_pateff_prod = mean(pateff_prod)
bysort sic2: egen mean_pateff_proc = mean(pateff_proc)
gen pateff_prod_fillin = mean_pateff_prod
gen pateff_proc_fillin = mean_pateff_proc

keep sic2 pateff_prod_fillin pateff_proc_fillin
duplicates drop
* Medians across sic2 industries, and above-median indicators.
* NOTE(review): a sic2 whose fill-in value is missing is coded "high",
* because missing compares as larger than the median.
egen median_pateff_prod = median(pateff_prod_fillin)
egen median_pateff_proc = median(pateff_proc_fillin)
gen high_prod_fillin = cond(pateff_prod_fillin > median_pateff_prod, 1, 0)
gen high_proc_fillin = cond(pateff_proc_fillin > median_pateff_proc, 1, 0)
keep sic2 high_prod_fillin high_proc_fillin pateff_prod_fillin pateff_proc_fillin
tempfile temp 
save `temp', replace 
restore

/* -capture- keeps the script running when no check* variable is present
(a plain -drop- would stop with r(111) in that case). */
capture drop check* 
merge m:1 sic2 using `temp'
* Discard industries that exist only in the lookup, as every other merge in
* this file does; otherwise they would be appended as rows without a patent.
drop if _merge == 2
drop _merge

* Low-effectiveness indicators and their interactions with the CAFC measures.
gen low_prod = 1 - high_prod_fillin
gen low_proc = 1 - high_proc_fillin
gen CAFC_post_prod = CAFC_post * low_prod
gen CAFC_post_proc = CAFC_post * low_proc
gen CAFC_prod = gap_all * low_prod
gen CAFC_proc = gap_all * low_proc

* Circuit-year aggregates of state-level PhD and R&D measures, built in a
* temporary file and merged back on circuit and year.
preserve
use "\other\macro.dta", clear
merge 1:1 state year using "\other\edu.dta", keep(match) nogenerate

* State-level inputs.
generate phd_st = edu_phd * popn / 1000000
generate rd_st = rnd_nom_ipolate/defltr_fed*100
rename popn popn_st

keep state year phd_st rd_st popn_st  gdp_real

* Attach the circuit of each state-year; matched rows only.
merge 1:1 state year using "fedapp_yrs.dta", keep(match) nogenerate

* Circuit-year totals (missing values are treated as 0 in the totals).
foreach s in phd rd popn {
    bysort circuit year: egen `s'_ct = total(`s'_st)
}

* Per-capita versions of the PhD and R&D totals.
foreach s in phd rd {
    generate `s'_ct_percapita = `s'_ct/popn_ct
}

bysort circuit year: egen gdp_real_ct = total(gdp_real)

* NOTE(review): utsa_ct is not created in this section - presumably it comes
* from fedapp_yrs.dta; gdp_real_ct is computed above but not retained.
keep circuit year phd_ct_percapita rd_ct_percapita utsa_ct
duplicates drop
tempfile temp
save `temp', replace
restore

merge m:1 circuit year using `temp', keep(master match) nogenerate

* Number of distinct patents per assignee-circuit group and year.
bysort pd_ct year: egen npat_pdct = nvals(patent)

* Transforms of the circuit-level measures; the asinh versions are computed
* before rd_ct_percapita is rescaled below.
generate phd_ct_inh = asinh(phd_ct_percapita)
generate rd_ct_inh = asinh(rd_ct_percapita)
generate phd_ct_perm = phd_ct_percapita * 1000000
replace rd_ct_percapita = rd_ct_percapita * 1000000

* Citation counts by circuit, assignee and year; unmatched patents get zeros.
merge m:1 circuit pdpass year using "pat_ncites_ct.dta", keep(master match) nogenerate
foreach v in ncites_pd_ct ncites_pd ncites_pd_ct_rate {
    replace `v' = 0 if mi(`v')
}

* temp = ncites_pd_ct_rate, set to 1 when ncites_pd_ct is 0.
generate temp = cond(ncites_pd_ct == 0, 1, ncites_pd_ct_rate)

* Split the CAFC measure into a part weighted by temp and its complement.
generate gap_all_home = gap_all * temp
generate CAFC_home = gap_all * post * temp
generate CAFC_forshop = gap_all * post * (1 - temp)

compress
save "Patreg.dta", replace
